In [ ]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")
In [ ]:
#Importing the dataset in the notebook
data = pd.read_csv('../../Spotify Music Dataset/data/data.csv')
genre_data = pd.read_csv('../../Spotify Music Dataset/data_by_genres.csv')
year_data = pd.read_csv('../../Spotify Music Dataset/data_by_year.csv')
In [3]:
data.describe()
Out[3]:
valence year acousticness danceability duration_ms energy explicit instrumentalness key liveness loudness mode popularity speechiness tempo
count 170653.000000 170653.000000 170653.000000 170653.000000 1.706530e+05 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000 170653.000000
mean 0.528587 1976.787241 0.502115 0.537396 2.309483e+05 0.482389 0.084575 0.167010 5.199844 0.205839 -11.467990 0.706902 31.431794 0.098393 116.861590
std 0.263171 25.917853 0.376032 0.176138 1.261184e+05 0.267646 0.278249 0.313475 3.515094 0.174805 5.697943 0.455184 21.826615 0.162740 30.708533
min 0.000000 1921.000000 0.000000 0.000000 5.108000e+03 0.000000 0.000000 0.000000 0.000000 0.000000 -60.000000 0.000000 0.000000 0.000000 0.000000
25% 0.317000 1956.000000 0.102000 0.415000 1.698270e+05 0.255000 0.000000 0.000000 2.000000 0.098800 -14.615000 0.000000 11.000000 0.034900 93.421000
50% 0.540000 1977.000000 0.516000 0.548000 2.074670e+05 0.471000 0.000000 0.000216 5.000000 0.136000 -10.580000 1.000000 33.000000 0.045000 114.729000
75% 0.747000 1999.000000 0.893000 0.668000 2.624000e+05 0.703000 0.000000 0.102000 8.000000 0.261000 -7.183000 1.000000 48.000000 0.075600 135.537000
max 1.000000 2020.000000 0.996000 0.988000 5.403500e+06 1.000000 1.000000 1.000000 11.000000 1.000000 3.855000 1.000000 100.000000 0.970000 243.507000
In [4]:
year_data.head()
Out[4]:
mode year acousticness danceability duration_ms energy instrumentalness liveness loudness speechiness tempo valence popularity key
0 1 1921 0.886896 0.418597 260537.166667 0.231815 0.344878 0.205710 -17.048667 0.073662 101.531493 0.379327 0.653333 2
1 1 1922 0.938592 0.482042 165469.746479 0.237815 0.434195 0.240720 -19.275282 0.116655 100.884521 0.535549 0.140845 10
2 1 1923 0.957247 0.577341 177942.362162 0.262406 0.371733 0.227462 -14.129211 0.093949 114.010730 0.625492 5.389189 0
3 1 1924 0.940200 0.549894 191046.707627 0.344347 0.581701 0.235219 -14.231343 0.092089 120.689572 0.663725 0.661017 10
4 1 1925 0.962607 0.573863 184986.924460 0.278594 0.418297 0.237668 -14.146414 0.111918 115.521921 0.621929 2.604317 5
In [5]:
genre_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2973 entries, 0 to 2972
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              2973 non-null   int64  
 1   genres            2973 non-null   object 
 2   acousticness      2973 non-null   float64
 3   danceability      2973 non-null   float64
 4   duration_ms       2973 non-null   float64
 5   energy            2973 non-null   float64
 6   instrumentalness  2973 non-null   float64
 7   liveness          2973 non-null   float64
 8   loudness          2973 non-null   float64
 9   speechiness       2973 non-null   float64
 10  tempo             2973 non-null   float64
 11  valence           2973 non-null   float64
 12  popularity        2973 non-null   float64
 13  key               2973 non-null   int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 325.3+ KB
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      170653 non-null  object 
 17  speechiness       170653 non-null  float64
 18  tempo             170653 non-null  float64
dtypes: float64(9), int64(6), object(4)
memory usage: 24.7+ MB
In [7]:
data['duration_ms'] = data['duration_ms'].apply(lambda x:x/60000).round(1)

data.rename(columns={'duration_ms':'duration_min'},inplace = True)
data.head()
Out[7]:
valence year acousticness artists danceability duration_min energy explicit id instrumentalness key liveness loudness mode name popularity release_date speechiness tempo
0 0.0594 1921 0.982 ['Sergei Rachmaninoff', 'James Levine', 'Berli... 0.279 13.9 0.211 0 4BJqT0PrAfrxzMOxytFOIz 0.878000 10 0.665 -20.096 1 Piano Concerto No. 3 in D Minor, Op. 30: III. ... 4 1921 0.0366 80.954
1 0.9630 1921 0.732 ['Dennis Day'] 0.819 3.0 0.341 0 7xPhfUan2yNtyFG0cUWkt8 0.000000 7 0.160 -12.441 1 Clancy Lowered the Boom 5 1921 0.4150 60.936
2 0.0394 1921 0.961 ['KHP Kridhamardawa Karaton Ngayogyakarta Hadi... 0.328 8.3 0.166 0 1o6I8BglA6ylDMrIELygv1 0.913000 3 0.101 -14.850 1 Gati Bali 5 1921 0.0339 110.339
3 0.1650 1921 0.967 ['Frank Parker'] 0.275 3.5 0.309 0 3ftBPsC5vPBKxYSee08FDH 0.000028 5 0.381 -9.316 1 Danny Boy 3 1921 0.0354 100.109
4 0.2530 1921 0.957 ['Phil Regan'] 0.418 2.8 0.193 0 4d6HGyGT8e121BsdKmw9v6 0.000002 3 0.229 -10.096 1 When Irish Eyes Are Smiling 2 1921 0.0380 101.665
In [8]:
data = data.drop(columns=['id','popularity','release_date','mode'])
data.head()
Out[8]:
valence year acousticness artists danceability duration_min energy explicit instrumentalness key liveness loudness name speechiness tempo
0 0.0594 1921 0.982 ['Sergei Rachmaninoff', 'James Levine', 'Berli... 0.279 13.9 0.211 0 0.878000 10 0.665 -20.096 Piano Concerto No. 3 in D Minor, Op. 30: III. ... 0.0366 80.954
1 0.9630 1921 0.732 ['Dennis Day'] 0.819 3.0 0.341 0 0.000000 7 0.160 -12.441 Clancy Lowered the Boom 0.4150 60.936
2 0.0394 1921 0.961 ['KHP Kridhamardawa Karaton Ngayogyakarta Hadi... 0.328 8.3 0.166 0 0.913000 3 0.101 -14.850 Gati Bali 0.0339 110.339
3 0.1650 1921 0.967 ['Frank Parker'] 0.275 3.5 0.309 0 0.000028 5 0.381 -9.316 Danny Boy 0.0354 100.109
4 0.2530 1921 0.957 ['Phil Regan'] 0.418 2.8 0.193 0 0.000002 3 0.229 -10.096 When Irish Eyes Are Smiling 0.0380 101.665
In [9]:
data.isnull().sum()
Out[9]:
valence             0
year                0
acousticness        0
artists             0
danceability        0
duration_min        0
energy              0
explicit            0
instrumentalness    0
key                 0
liveness            0
loudness            0
name                0
speechiness         0
tempo               0
dtype: int64
In [10]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters=10))])
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)
In [11]:
from sklearn.manifold import TSNE

tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])
genre_embedding = tsne_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2973 samples in 0.005s...
[t-SNE] Computed neighbors for 2973 samples in 0.409s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2973
[t-SNE] Computed conditional probabilities for sample 2000 / 2973
[t-SNE] Computed conditional probabilities for sample 2973 / 2973
[t-SNE] Mean sigma: 0.777516
[t-SNE] KL divergence after 250 iterations with early exaggeration: 76.106087
[t-SNE] KL divergence after 1000 iterations: 1.391694
In [12]:
song_cluster_pipeline = Pipeline([('scaler', StandardScaler()), 
                                  ('kmeans', KMeans(n_clusters=20, 
                                   verbose=False))
                                 ], verbose=False)

X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels
In [13]:
from sklearn.decomposition import PCA

pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)
projection['title'] = data['name']
projection['cluster'] = data['cluster_label']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()
In [14]:
rmse_x=[]
mae_y=[]
r2_y=[]
rmse_y=[]

Linear Regression Model¶

In [15]:
#Performance evaluation Linear Regression Model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

X = genre_data.drop('genres',axis = 1)
y = genre_data.popularity

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

model = LinearRegression().fit(X_train, y_train)

y_pred = model.predict(X_test)

#print("We have the model2 coefficients as : ",model.coef_)
print("we have the model2 intercept as :",  model.intercept_)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print('Mean absolute error is : ', mae)
print('RMSE: ', rmse)

print("MSE is ",mean_squared_error(y_test, y_pred))
print("R2 score is",r2_score(y_pred, y_test))

rmse_x.append('Linear')
mae_y.append(mae)
r2_y.append(r2_score(y_pred, y_test))
rmse_y.append(rmse)
we have the model2 intercept as : 1.0658141036401503e-13
Mean absolute error is :  2.0486135990711773e-14
RMSE:  2.625128862461853e-14
MSE is  6.8913015445302635e-28
R2 score is 1.0

Polynomial Regression Model¶

In [16]:
# Performance Evaluation Polynomial Regression Model

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

X = genre_data.drop('genres',axis = 1)
y = genre_data.popularity

poly = PolynomialFeatures(degree=5, include_bias=False)
poly_features = poly.fit_transform(X)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(poly_features, y, test_size=0.3, random_state=42)

model = LinearRegression().fit(X_train, y_train)

y_pred = model.predict(X_test)

#print("We have the model2 coefficients as : ",model.coef_)
print("we have the model2 intercept as :",  model.intercept_)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print('Mean absolute error is : ', mae)
print('RMSE: ', rmse)

print("MSE is ",mean_squared_error(y_test, y_pred))
print("R2 score is",r2_score(y_pred, y_test))

rmse_x.append('Polynomial')
mae_y.append(mae)
r2_y.append(r2_score(y_pred, y_test))
rmse_y.append(rmse)
we have the model2 intercept as : 25.591631936431384
Mean absolute error is :  4.971091411901953
RMSE:  10.47069382191658
MSE is  109.63542911232204
R2 score is 0.659288677137343

KNN Regression Model¶

In [17]:
#Performance evaluation KNN Regression Model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

X = genre_data.drop('genres',axis = 1)
y = genre_data.popularity

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

model = KNeighborsRegressor(n_neighbors=10).fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print('Mean absolute error is : ', mae)
print('RMSE: ', rmse)

print("MSE is ",mean_squared_error(y_test, y_pred))
print("R2 score is",r2_score(y_pred, y_test))

rmse_x.append('KNN')
mae_y.append(mae)
r2_y.append(r2_score(y_pred, y_test))
rmse_y.append(rmse)
Mean absolute error is :  13.199291241148623
RMSE:  17.223746911705458
MSE is  296.65745767848335
R2 score is -5.0722240317196885

Random Forest Regression Model¶

In [18]:
#Performance evaluation RandomForest Regression Model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

X = genre_data.drop('genres',axis = 1)
y = genre_data.popularity

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

model = RandomForestRegressor(n_estimators = 200, random_state = 10).fit(X_train, y_train)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print('Mean absolute error is : ', mae)
print('RMSE: ', rmse)

print("MSE is ",mean_squared_error(y_test, y_pred))
print("R2 score is",r2_score(y_pred, y_test))

rmse_x.append('RandomForest')
mae_y.append(mae)
r2_y.append(r2_score(y_pred, y_test))
rmse_y.append(rmse)
Mean absolute error is :  0.025416102885776427
RMSE:  0.06654311342031904
MSE is  0.004427985943669445
R2 score is 0.9999842273484254
In [19]:
pd.DataFrame({'Column': X_train.columns, 'Feature Importance': model.feature_importances_})
Out[19]:
Column Feature Importance
0 mode 4.106967e-07
1 acousticness 3.249349e-06
2 danceability 4.220754e-06
3 duration_ms 3.572716e-06
4 energy 2.762728e-06
5 instrumentalness 3.365163e-06
6 liveness 2.441553e-06
7 loudness 2.707293e-06
8 speechiness 3.332215e-06
9 tempo 2.896989e-06
10 valence 2.200417e-06
11 popularity 9.999653e-01
12 key 2.116160e-06
13 cluster 1.418885e-06

Lasso Regression Model¶

In [20]:
#Performance evaluation Lasso Regression Model
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

X = genre_data.drop('genres',axis = 1)
y = genre_data.popularity

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

scaler = StandardScaler().fit(X_train) 

X_train = scaler.transform(X_train)

X_test = scaler.transform(X_test)

model = Lasso(alpha=1).fit(X_train, y_train)

y_pred = model.predict(X_test)

#print("We have the model2 coefficients as : ",model.coef_)
print("we have the model2 intercept as :",  model.intercept_)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print('Mean absolute error is : ', mae)
print('RMSE: ', rmse)

print("MSE is ",mean_squared_error(y_test, y_pred))
print("R2 score is",r2_score(y_pred, y_test))

rmse_x.append('Lasso')
mae_y.append(mae)
r2_y.append(r2_score(y_pred, y_test))
rmse_y.append(rmse)
we have the model2 intercept as : 39.80992289692062
Mean absolute error is :  0.7841299425755205
RMSE:  1.0015376218892358
MSE is  1.0030776080595458
R2 score is 0.9959608007291691
In [21]:
df_algo_error = pd.DataFrame({'Algorithm': rmse_x, 'Error Values': rmse_y})
df_algo_error.sort_values('Error Values',inplace=True)
df_algo_error
Out[21]:
Algorithm Error Values
0 Linear 2.625129e-14
3 RandomForest 6.654311e-02
4 Lasso 1.001538e+00
1 Polynomial 1.047069e+01
2 KNN 1.722375e+01
In [22]:
plt.figure(figsize=(12,4))
plt.barh(y=df_algo_error['Algorithm'], width=df_algo_error['Error Values'],left=df_algo_error['Error Values'])
plt.title('Gantt Chart for MSE Error of various Algorithms', fontsize=15)
plt.show()
In [23]:
plt.figure(figsize=(12,8))
plt.plot(rmse_x, rmse_y, label = "Root Mean Squared Error")
plt.plot(rmse_x, mae_y, label = "Mean Absolute Error")
plt.legend()
Out[23]:
<matplotlib.legend.Legend at 0x16f91368ac0>
In [24]:
plt.plot(rmse_x, r2_y, label = "R2 Score")
plt.legend()
Out[24]:
<matplotlib.legend.Legend at 0x16f913ae050>
In [ ]: